Final-Blog-1.utf8

Are they tanking in Texas???

By: Laura Stickells

April 28, 2021

Step 1: Load the recruit data

Data for cfbscrapR comes from https://collegefootballdata.com/. I filter the data set to only include committed recruits.

Sys.setenv(CFBD_API_KEY = "Z5JDzujwt45dUw9h/GVgDo4aJOnvY6QUSNm0xGyKDPeRBfoltU1afsqnuOyEjw95")
  
seasons <- 2000:2020
recruitData <- data.frame()
for(i in seasons) {
  season <- i
  recruit <-cfbd_recruiting_player(
  year = i,
  recruit_type = "HighSchool",
  )
  recruit$hometown_latitude <- recruit$hometownInfo$latitude
  recruit$hometown_longitude <- recruit$hometownInfo$longitude
  recruit$hometown_fipscode <- recruit$hometownInfo$fipsCode
  recruit <- recruit[ -c(17) ] #if there is a bug, check the column index.
  recruitData <- rbind(recruit, recruitData)
}

## 2021-04-28 02:51:00: Scraping player recruiting data...
## 2021-04-28 02:51:00: Scraping player recruiting data...
## 2021-04-28 02:51:00: Scraping player recruiting data...

## 2021-04-28 02:51:01: Scraping player recruiting data...
## 2021-04-28 02:51:01: Scraping player recruiting data...
## 2021-04-28 02:51:01: Scraping player recruiting data...
## 2021-04-28 02:51:01: Scraping player recruiting data...

## 2021-04-28 02:51:02: Scraping player recruiting data...
## 2021-04-28 02:51:02: Scraping player recruiting data...
## 2021-04-28 02:51:02: Scraping player recruiting data...

## 2021-04-28 02:51:03: Scraping player recruiting data...

## 2021-04-28 02:51:04: Scraping player recruiting data...

## 2021-04-28 02:51:05: Scraping player recruiting data...
## 2021-04-28 02:51:05: Scraping player recruiting data...

## 2021-04-28 02:51:06: Scraping player recruiting data...
## 2021-04-28 02:51:06: Scraping player recruiting data...

## 2021-04-28 02:51:07: Scraping player recruiting data...
## 2021-04-28 02:51:07: Scraping player recruiting data...
## 2021-04-28 02:51:07: Scraping player recruiting data...

## 2021-04-28 02:51:08: Scraping player recruiting data...
## 2021-04-28 02:51:08: Scraping player recruiting data...

rm(recruit, season, seasons)
recruitData <- recruitData %>%
  filter(!is.na(committed_to))

Step 2: Pull all of the draft data since 2000

Data comes from PFR. Inspo from https://github.com/paulg66/NFL_Draft/blob/master/Draft%20Results%20Scrape.R

urlPrefix <- "https://www.pro-football-reference.com/years/"
urlEnd<- "/draft.htm"
startyear <- 2000
endyear <- 2020

draftData <- NULL

for (i in startyear:endyear){
  year <- i
  url <- paste(urlPrefix,as.character(year),urlEnd,sep = '') #Build URL with increased year
  query <- getURL(url)
  query <- readHTMLTable(query, stringsAsFactors = F) #pull data
  tempDraftData <- query$drafts
  tempDraftData$DraftYear <- year #add draft year
  draftData <- rbind(draftData,tempDraftData) #rbind to main dataset
}
rm(tempDraftData)

draftData <- draftData[draftData$Rnd != "Rnd",] #remove title rows
draftData$Player <- str_remove(draftData$Player,"HOF") #remove HOFs
draftData$Player <- str_trim(draftData$Player,"right") #remove spaces at end of names

draftData$Pick <- sapply(draftData$Pick, as.numeric) #fix format
draftData$Rnd <- sapply(draftData$Rnd, as.numeric)
draftData$Age <- sapply(draftData$Age, as.numeric)

# Fix school names to match recruitData

draftData$`College/Univ` <- gsub("St.", "State", draftData$`College/Univ`, fixed=TRUE)
draftData$`College/Univ` <- gsub("Col.", "College", draftData$`College/Univ`, fixed=TRUE)
draftData$`College/Univ` <- gsub(" (FL)", "", draftData$`College/Univ`, fixed=TRUE)
draftData$`College/Univ` <- gsub("North Carolina State", "NC State", draftData$`College/Univ`, fixed=TRUE)
draftData$`College/Univ` <- gsub("Mississippi", "Ole Miss", draftData$`College/Univ`, fixed=TRUE)
draftData$`College/Univ` <- gsub("North Carolina State", "NC State", draftData$`College/Univ`, fixed=TRUE)
draftData$`College/Univ` <- gsub("Ole Miss State", "Mississippi State", draftData$`College/Univ`, fixed=TRUE)
draftData$`College/Univ` <- gsub("Hawaii", "Hawai'i", draftData$`College/Univ`, fixed=TRUE)
draftData$`College/Univ` <- gsub("Central Florida", "UCF", draftData$`College/Univ`, fixed=TRUE)
draftData$`College/Univ` <- gsub("Southern Miss", "Southern Mississippi", draftData$`College/Univ`, fixed=TRUE)
draftData$`College/Univ` <- gsub("Cal Poly-San Luis Obispo", "Cal Poly", draftData$`College/Univ`, fixed=TRUE)
draftData$`College/Univ` <- gsub("California-Davis", "UC Davis", draftData$`College/Univ`, fixed=TRUE)
draftData$`College/Univ` <- gsub("Massachusetts", "UMass", draftData$`College/Univ`, fixed=TRUE)

draftData <- draftData[ -c(29) ]

cols = names(draftData)
cols[1] = 'Rnd'
cols[2] = 'Pick'
cols[3] = 'Team'
cols[4] = 'Player'
cols[5] = 'Pos'
cols[6] = 'Age'
cols[7] = 'Last Year'
cols[8] = '1st Team All Pro'
cols[9] = 'Pro Bowl'
cols[10] = 'Years as Starter'
cols[11] = 'Career AV'
cols[12] = 'Draft Team AV'
cols[13] = 'Games Played'
cols[14] = 'Passes Completed'
cols[15] = 'Pass Attempts'
cols[16] = 'Passing Yds'
cols[17] = 'Passing TDs'
cols[18] = 'Interceptions Thrown'
cols[19] = 'Rushing Attepts'
cols[20] = 'Rushing Yds'
cols[21] = 'Rushing TDs'
cols[22] = 'Receptions'
cols[23] = 'Recieving Yds'
cols[24] = 'Recieving TDs'
cols[25] = 'Solo Tackles'
cols[26] = 'Interceptions (Def)'
cols[27] = 'Sacks (Def)'
cols[28] = 'College/Univ'
cols[29] = 'Draft Year'
names(draftData) = cols

Step 3: Load School Data

teamInfo <- cfbd_team_info(conference = NULL, only_fbs = TRUE, year = NULL)
logos <- data.frame(matrix(unlist(teamInfo$logos), nrow=length(teamInfo$logos), byrow=TRUE),stringsAsFactors=FALSE)
teamInfo <- cbind(teamInfo, logos) %>%
  rename(
    logo1 = X1,
    logo2 = X2
    )
teamInfo <- teamInfo[ -c(12) ]
rm(logos)

Step 4: Rank schools based on number of picks since 2005

draftSuccessTable <- draftData %>%
  filter(`Draft Year` >= 2005) %>%
  group_by(`College/Univ`) %>%
  summarise(
    avgPick = mean(Pick),
    numPicks = n(),
    avgAge = mean(Age, na.rm = TRUE)
  ) %>%
  mutate(
    draftRank = trunc(rank(-numPicks))
  )

Step 5: Rank schools based on average recruit rating since 2005

recruitSuccessTable <- recruitData %>%
  filter(year >= 2005, ranking <= 224) %>%
  mutate(
    fiveStar = case_when(
      stars == 5 ~ 1,
      stars != 5 ~ 0
    )) %>%
  group_by(committed_to) %>%
  summarise(
    numRecruits = n(),
    avgRating = mean(rating),
    fiveStars = sum(fiveStar)
  ) %>%
  mutate(
    recruitRank = trunc(rank(-numRecruits))
  )

Step 6: Bind the tables by school, find the difference between the rankings and filter the schools only with 20 or more draft picks since 2000. Then build a table looking at the top 5.

recruitXdraft <- merge(x = recruitSuccessTable, y = draftSuccessTable, by.x = "committed_to", by.y = "College/Univ", all = TRUE) %>%
  mutate(
    glowUp = recruitRank - draftRank
  ) %>%
  filter(numPicks >= 20)

recruitXdraft %>%
  filter(recruitRank <= 5) %>%
  select(committed_to, numRecruits, recruitRank, numPicks, draftRank) %>%
  arrange(recruitRank) %>%
  gt() %>%
    cols_align(
    align = "center",
    columns = c(3,5)
  ) %>%
  tab_header(
    title = "Top 5 recruiting schools and their draft rankings",
    subtitle = "Since 2005"
  ) %>% 
  tab_options(
    table.border.top.color = "white",
    row.striping.include_table_body = FALSE
  ) %>%
  tab_source_note(
    source_note = "SOURCE: @cfbfastR and @pfrer"
  ) %>%
  cols_label(
    committed_to = "SCHOOL",
    numRecruits = "TOP 224 RECRUITS",
    recruitRank = "RECRUITING RANK",
    numPicks = "DRAFT PICKS",
    draftRank = "DRAFT RANK",
  ) %>%
  tab_style(
    style = list(
      cell_fill(color = "lightyellow")
  ), 
    locations = cells_body(
      columns = vars(committed_to, numRecruits, recruitRank, numPicks, draftRank),
      rows = committed_to == "Texas")
  ) %>%
  tab_style(
    style = list(
      cell_text(weight = "bold")
      ),
    locations = cells_body(
      columns = vars(committed_to, numRecruits, recruitRank, numPicks, draftRank),
      rows = committed_to == "Texas")
  )

## Warning: The `.dots` argument of `group_by()` is deprecated as of dplyr 1.0.0.

Top 5 recruiting schools and their draft rankings
Since 2005
SCHOOL	TOP 224 RECRUITS	RECRUITING RANK	DRAFT PICKS	DRAFT RANK
Alabama	219	1	109	1
Texas	183	2	58	17
Ohio State	181	3	101	3
USC	181	3	93	4
Georgia	178	5	86	6
SOURCE: @cfbfastR and @pfrer

Step 7: Build a visualization showing how Texas’ draft success compares to other schools with top 5 talent… turned this into an R-Shiny later.

topFive <- recruitXdraft %>%
  filter(recruitRank <= 5)

draftData_graph <- draftData %>%
  filter(`College/Univ` %in% topFive$committed_to) %>%
  group_by(`College/Univ`, `Draft Year`) %>%
  summarise(
    numPicks = n()
  )

## `summarise()` has grouped output by 'College/Univ'. You can override using the `.groups` argument.

schools <- unique(draftData_graph$`College/Univ`)
years <- unique(draftData$`Draft Year`)

seasonsHack <- data.frame(school = sort(rep(schools, length(years))),
                 year = rep(years, length(schools)))

draftData_graph1 <- merge(x = seasonsHack, y = draftData_graph, by.x = c("school","year"), by.y = c("College/Univ", "Draft Year"),  all = TRUE)
draftData_graph1[is.na(draftData_graph1)] <- 0

draftData_graph1 %>%
  left_join(teamInfo, by = c('school' = 'school')) %>%
  ggplot(aes(x = year, y = numPicks, group = school)) +
  geom_line(aes(color = color), size = 1) +
  geom_point(aes(color = color), alpha=.7) +
  scale_colour_identity() +
  geom_hline(yintercept = (mean(topFive$numPicks)/16), color = "black", linetype = "dashed") +
  xlim(2005, 2020) +
  ylim(0, 15) +
  theme_fivethirtyeight() +
  theme(
    legend.title = element_blank(),
    strip.text = element_text(face = "bold"),
    axis.title.y = element_text(),
    axis.title.x = element_blank(),
    axis.text.x = element_text(angle = 45, vjust = .7)
    ) +
  labs(
    y = "Number of Draft Picks",
    title = "Draft picks by season",
    subtitle = "Dotted line represents average draft picks for schools with top 5\nrecruiting from 2005 to 2020",
    caption = "Data: @cfbfastR | Plot: @LauraStickells"
  )

## Warning: Removed 25 row(s) containing missing values (geom_path).

## Warning: Removed 25 rows containing missing values (geom_point).

Step 8: Build a visualization that shows where Texas gets most of its top 100 recruits

texasMapData1 <- recruitData %>%
  left_join(teamInfo, by = c('committed_to' = 'school')) %>%
  filter(committed_to %in% c("Texas"), ranking <= 224, year >= 2005)

seasonsHack <- data.frame(year = 2005:2020)

texasMapData1 <- merge(texasMapData1, seasonsHack, by="year", all.x=T, all.y=T)
texasMapData1$hometown_latitude[is.na(texasMapData1$hometown_latitude)] <- 33
texasMapData1$hometown_longitude[is.na(texasMapData1$hometown_longitude)] <- -96

texasMapData1$hometown_longitude <- sapply(texasMapData1$hometown_longitude, as.numeric)
texasMapData1$hometown_latitude <- sapply(texasMapData1$hometown_latitude, as.numeric)

texasMapData1Transformed <- usmap_transform(texasMapData1[ c(18, 17) ])

## Warning in showSRID(uprojargs, format = "PROJ", multiline = "NO", prefer_proj =
## prefer_proj): Discarded datum unknown in Proj4 definition

texasMapData1 <- texasMapData1 %>%
  left_join(texasMapData1Transformed, by = c('hometown_longitude', 'hometown_latitude'))

asp_ratio <- 1.618

TexasRecruitsMap1 <- plot_usmap("states") + 
  geom_image(data = texasMapData1, 
             aes(x = hometown_longitude.1, y = hometown_latitude.1, 
             image = logo1),
             size = 0.07, by = "width", asp = asp_ratio) +
  theme_economist() +
  theme(
    panel.grid = element_blank(),
        axis.title = element_blank(),
        axis.text = element_blank(),
        axis.ticks = element_blank(),
        axis.line = element_blank()
    ) +
    labs(
      title = "     Texas commits in 247 Sports' top 224 in {closest_state}",
      subtitle = "",
      caption = "Data: @cfbfastR | Plot: @LauraStickells"
      ) +
    transition_states(
      year,0,30
  )

animate(TexasRecruitsMap1, 
        duration = 60,
       fps = 2,
      height = 6, 
       width = 6,
       units = "in", 
       res = 300
      )

anim_save("Texas Recruits Map.gif")

Step 8: Build a visualization that shows where Texas high school talent goes.

texasGraphData <- recruitData %>%
  filter(state_province %in% c("TX"), ranking <= 224, year >= 2005) %>%
  group_by(committed_to) %>%
  summarise(
    total_commits = n()
  ) 

texasGraphData %>%
  filter(total_commits >= 15) %>%
  left_join(teamInfo, by = c('committed_to' = 'school')) %>%
  ggplot(aes(x = reorder(committed_to, -total_commits) , y = total_commits)) +
  geom_col(aes(fill = color, color = color), alpha = 0.7) +
  geom_label(aes(label = total_commits), fill = "white", vjust = -0.1, size = 3, fontface = "bold") +
  scale_color_identity(aesthetics = c("color", "fill")) +
  ylim(0, 175) +
  theme_fivethirtyeight() +
  theme(
    legend.title = element_blank(),
    strip.text = element_text(face = "bold"),
    axis.title.y = element_text(face = "bold"),
    axis.title.x = element_blank(),
    panel.grid.major.x = element_blank(),
     axis.text.x = element_text(face = "bold")
    ) +
  labs(
    y = "Number of Recruits",
    title = "Where does top in-state talent go?",
    subtitle = "Texas high school talent in 247 Sports' top 224 since 2005",
    caption = "Data: @cfbfastR | Plot: @LauraStickells"
  )

Step 9: Where does Texas rank in terms of high school talent?

recruitsXstate <- recruitData %>%
  filter(year >= 2005, ranking <= 224) %>%
  group_by(state_province) %>%
  summarise(
    n = n()
  )